library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)

Introduction

This report analyzes DeepSEA functional annotations for fine-mapped eQTLs (LMM), comparing TST-responsive genes vs non-TST genes.

# Load eigenMT results
cat("Loading eigenMT results...\n")
## Loading eigenMT results...
eGenes <- fread(params$eigenMT_path)
eGenes_sig <- eGenes[eGenes$BF.FDR < params$FDR_threshold, ]
cat("Significant eGenes (FDR <", params$FDR_threshold, "):", nrow(eGenes_sig), "\n")
## Significant eGenes (FDR < 0.05 ): 7411
cat("Unique genes:", length(unique(eGenes_sig$gene)), "\n")
## Unique genes: 7411
cat("Unique SNPs:", length(unique(eGenes_sig$SNP)), "\n")
## Unique SNPs: 7189
# Load TST genes
cat("Loading TST genes...\n")
## Loading TST genes...
TST <- fread(params$TST_genes_path)
TST_filtered <- TST[TST$MaxFC >= params$FC_threshold, ]
cat("TST genes (FC >=", params$FC_threshold, "):", nrow(TST_filtered), "\n")
## TST genes (FC >= 0.58 ): 5526
# Split into TST and non-TST eGenes
eGenes_TST <- eGenes_sig[eGenes_sig$gene %in% TST_filtered$HGNC, ]
eGenes_nonTST <- eGenes_sig[!eGenes_sig$gene %in% TST_filtered$HGNC, ]

cat("TST eGenes:", nrow(eGenes_TST), "\n")
## TST eGenes: 1719
cat("Non-TST eGenes:", nrow(eGenes_nonTST), "\n")
## Non-TST eGenes: 5692
# Check for duplicated SNPs
dup_snps <- eGenes_sig[duplicated(eGenes_sig$SNP), ]
cat("Duplicated SNPs:", nrow(dup_snps), "\n")
## Duplicated SNPs: 222
cat("Loading DeepSEA 40 sequence class data...\n")
## Loading DeepSEA 40 sequence class data...
if (file.exists(params$deepsea_40seq_path)) {
  DeepSEA_40 <- fread(params$deepsea_40seq_path)
  cat("DeepSEA 40 sequence class data dimensions:", dim(DeepSEA_40), "\n")
  
  # Merge with TST eGenes
  DeepSEA_TST <- merge(eGenes_TST, DeepSEA_40, by.x = "SNP", by.y = "id")
  cat("Merged TST eGenes with DeepSEA:", nrow(DeepSEA_TST), "\n")
  
  # Get sequence class columns (assuming they start at column 18)
  seq_cols <- colnames(DeepSEA_TST)[18:57]
  cat("Number of sequence classes:", length(seq_cols), "\n")
  cat("First few sequence classes:", head(seq_cols), "\n")
} else {
  stop("DeepSEA 40 sequence class file not found: ", params$deepsea_40seq_path)
}
## DeepSEA 40 sequence class data dimensions: 7189 47 
## Merged TST eGenes with DeepSEA: 1719 
## Number of sequence classes: 40 
## First few sequence classes: PC1 Polycomb / Heterochromatin L1 Low signal TN1 Transcription TN2 Transcription L2 Low signal E1 Stem cell

Correlation Analysis for 40 Sequence Classes

# Perform correlation analysis for each sequence class
cat("Performing correlation analysis for 40 sequence classes...\n")
## Performing correlation analysis for 40 sequence classes...
cor_test_estimate <- as.data.frame(sapply(DeepSEA_TST[, seq_cols, with = FALSE], function(x) {
  cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$estimate
}))

cor_test_p <- as.data.frame(sapply(DeepSEA_TST[, seq_cols, with = FALSE], function(x) {
  cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$p.value
}))

cor_results <- cbind(cor_test_estimate, cor_test_p)
colnames(cor_results) <- c("spearman.r", "p.value")
cor_results$class <- gsub(".rho", "", rownames(cor_results))
cor_results$fdr <- p.adjust(cor_results$p.value, method = "fdr", n = length(cor_results$p.value))

# Add significance annotation
cor_results$sig <- ifelse(cor_results$fdr < 0.05, "sig", "no")
cat("Significant correlations (FDR < 0.05):", sum(cor_results$fdr < 0.05), "\n")
## Significant correlations (FDR < 0.05): 17
# Plot correlation results
cat("Plotting correlation results...\n")
## Plotting correlation results...
ggplot(cor_results, aes(x = spearman.r, y = -log10(fdr), fill = sig)) +
  scale_fill_manual(values = c("grey", "deepskyblue3")) +
  geom_point(size = 3, shape = 21, colour = "grey4") + 
  theme_classic() +
  ylab("-Log10 Adjusted P-value") + 
  xlab("Spearman Correlation Coefficient") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") + 
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  ggrepel::geom_text_repel(
    data = cor_results[cor_results$fdr < 0.05, ], 
    aes(label = class), 
    max.overlaps = 100,
    point.padding = unit(0.2, "lines"),
    min.segment.length = 0,
    size = 3
  ) + 
  theme(legend.position = "none") +
  ggtitle("DeepSEA 40 Sequence Class Correlation with eQTL Effect Sizes\n(TST Genes)")

Correlation Analysis for DeepSEA 21,907 features

cat("Loading DeepSEA 21,907 features data...\n")
## Loading DeepSEA 21,907 features data...
if (file.exists(params$deepsea_features_path)) {
  DeepSEA_features <- fread(params$deepsea_features_path)
  cat("DeepSEA features data dimensions:", dim(DeepSEA_features), "\n")
  
  # Merge with non-TST eGenes
  DeepSEA_TST <- merge(eGenes_TST, DeepSEA_features, by.x = "SNP", by.y = "id")
  cat("Merged non-TST eGenes with DeepSEA features:", nrow(DeepSEA_TST), "\n")
  
  # Assuming features start at column 19 onwards
  feature_cols <- colnames(DeepSEA_TST)[19:ncol(DeepSEA_TST)]
  cat("Number of features:", length(feature_cols), "\n")
} else {
  stop("DeepSEA features file not found: ", params$deepsea_features_path)
}
## DeepSEA features data dimensions: 7189 21915 
## Merged non-TST eGenes with DeepSEA features: 1719 
## Number of features: 21907
cat("Performing correlation analysis for features (this may take a while)...\n")
## Performing correlation analysis for features (this may take a while)...
# For large feature sets, we might want to sample or process in chunks
# Here we'll process all features but be aware it might be computationally intensive

cor_test_estimate_features <- as.data.frame(sapply(DeepSEA_TST[, feature_cols, with = FALSE], function(x) {
  cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$estimate
}))

cor_test_p_features <- as.data.frame(sapply(DeepSEA_TST[, feature_cols, with = FALSE], function(x) {
  cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$p.value
}))

cor_features <- cbind(cor_test_estimate_features, cor_test_p_features)
colnames(cor_features) <- c("spearman.r", "p.value")
cor_features$class <- gsub(".rho", "", rownames(cor_features))

# Parse class information
cor_features <- cor_features %>%
  tidyr::separate(class, c("Cell.type", "chromatin.type", "dataset.id"), sep = "[|]", remove = FALSE)

cor_features$fdr <- p.adjust(cor_features$p.value, method = "fdr", n = length(cor_features$p.value))

cat("Significant feature correlations (FDR < 0.05):", sum(cor_features$fdr < 0.05), "\n")
## Significant feature correlations (FDR < 0.05): 8018

Volcano Plot for Top Features

# Select top feature per chromatin type and cell type
cor_features$id <- paste0(cor_features$chromatin.type, "-", cor_features$Cell.type)
cor_features_top <- setDT(cor_features)[, .SD[which.min(p.value)], by = id]

# Add significance annotation
cor_features_top$sig <- ifelse(cor_features_top$fdr < 0.05, "yes", "no")

cat("Top features after grouping:", nrow(cor_features_top), "\n")
## Top features after grouping: 8618
cat("Significant top features:", sum(cor_features_top$fdr < 0.05), "\n")
## Significant top features: 3917
# Filter top 10 positive and negative by spearman.r
top_pos <- cor_features_top %>%
  filter(fdr < 0.05) %>%
  arrange(desc(spearman.r)) %>%
  slice_head(n = 10)

top_neg <- cor_features_top %>%
  filter(fdr < 0.05) %>%
  arrange(spearman.r) %>%
  slice_head(n = 10)

# Plot volcano plot
cat("Plotting volcano plot for top features...\n")
## Plotting volcano plot for top features...
ggplot(cor_features_top, aes(x = spearman.r, y = -log10(fdr), fill = sig)) +
  scale_fill_manual(values = c("grey", "deepskyblue3")) +
  geom_point(size = 2, shape = 21, colour = "grey4") +
  theme_classic() +
  ylab("-Log10 Adjusted P-value") +
  xlab("Spearman Correlation Coefficient") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  coord_cartesian(xlim = c(-0.2, 0.2)) +
  
  # Label top 10 positive (right)
  geom_text_repel(
    data = top_pos,
    aes(label = id),
    hjust = 1,
    direction = "y",
    nudge_x = 0.2,
    point.padding = 0,
    size = 2
  ) +
  # Label top 10 negative (left)
  geom_text_repel(
    data = top_neg,
    aes(label = id),
    hjust = 0,
    direction = "y",
    nudge_x = -0.2,
    point.padding = 0,
    size = 2
  ) +
  theme(legend.position = "none") +
  ggtitle("DeepSEA Feature Correlation with eQTL Effect Sizes\n(TST Genes, Top per Cell Type + Chromatin Type)")

Comparison of average predicted effects

cat("Comparing average predicted effects between TST and non-TST eQTLs...\n")
## Comparing average predicted effects between TST and non-TST eQTLs...
# Add TST annotation to DeepSEA data
DeepSEA_40$TSTs <- ifelse(DeepSEA_40$id %in% eGenes_TST$SNP, "TST", "non-TST")
cat("TST SNPs in DeepSEA data:", sum(DeepSEA_40$TSTs == "TST"), "\n")
## TST SNPs in DeepSEA data: 1703
cat("Non-TST SNPs in DeepSEA data:", sum(DeepSEA_40$TSTs == "non-TST"), "\n")
## Non-TST SNPs in DeepSEA data: 5486
# Calculate average scores per group
plot_data <- DeepSEA_40
chromatin_cols <- colnames(plot_data)[which(colnames(plot_data) == "PC1 Polycomb / Heterochromatin"):which(colnames(plot_data) == "HET6 Centromere")]

avg_df <- plot_data %>%
  group_by(TSTs) %>%
  summarise(across(all_of(chromatin_cols), mean, na.rm = TRUE)) %>%
  pivot_longer(-TSTs, names_to = "Chromatin_State", values_to = "Average_Score")

# Reorder factor levels
avg_df$TSTs <- factor(avg_df$TSTs, levels = c("TST", "non-TST"))

cat("Average scores calculated for", length(unique(avg_df$Chromatin_State)), "chromatin states\n")
## Average scores calculated for 40 chromatin states
# Plot average scores comparison
ggplot(avg_df, aes(x = Chromatin_State, y = Average_Score, colour = TSTs)) +
  geom_point(size = 3) + 
  scale_color_manual(values = c("TST" = "darkorange", "non-TST" = "cyan4")) +
  facet_grid(TSTs ~ .) +
  labs(
    x = "Sequence Class",
    y = "Average Predicted Effect"
  ) + 
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_text(size = 10, face = "bold")
  ) +
  ggtitle("Average DeepSEA Predicted Effects by Sequence Class\nTST vs Non-TST eQTLs")

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.6     ggplot2_4.0.0     tidyr_1.3.1       dplyr_1.1.4      
## [5] data.table_1.16.2
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_1.8.7     highr_0.10         compiler_4.3.2    
##  [5] Rcpp_1.0.13-1      tidyselect_1.2.1   jquerylib_0.1.4    scales_1.4.0      
##  [9] yaml_2.3.7         fastmap_1.1.1      R6_2.5.1           labeling_0.4.3    
## [13] generics_0.1.3     knitr_1.45         tibble_3.2.1       bslib_0.5.1       
## [17] pillar_1.9.0       RColorBrewer_1.1-3 R.utils_2.12.3     rlang_1.1.4       
## [21] utf8_1.2.4         cachem_1.0.8       xfun_0.54          sass_0.4.7        
## [25] S7_0.2.0           cli_3.6.3          withr_3.0.2        magrittr_2.0.3    
## [29] digest_0.6.33      grid_4.3.2         lifecycle_1.0.4    R.oo_1.27.0       
## [33] R.methodsS3_1.8.2  vctrs_0.6.5        evaluate_0.23      glue_1.8.0        
## [37] farver_2.1.2       fansi_1.0.6        rmarkdown_2.25     purrr_1.0.2       
## [41] tools_4.3.2        pkgconfig_2.0.3    htmltools_0.5.7